{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6e05d20a-f3e2-436b-9354-78c331e62f30",
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "import matplotlib.pyplot as plt\n",
    "from linearmodels import PanelOLS\n",
    "from linearmodels.panel import PooledOLS\n",
    "import statsmodels.api as sm\n",
    "from tabulate import tabulate\n",
    "import os\n",
    "import pandas_market_calendars as mcal\n",
    "from pytz import timezone\n",
    "import textdistance\n",
    "from sklearn.feature_extraction.text import TfidfVectorizer\n",
    "from sklearn.metrics.pairwise import cosine_similarity\n",
    "import Levenshtein\n",
    "\n",
    "cd_data = '.../Data/Daily/'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ef4e4ada-32e7-46da-b9bb-84136d127626",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Congress tweets, with 'date' parsed to datetime\n",
    "df_firm_daily = pd.read_parquet(cd_data + 'Congress_tweets_daily_username.parquet')\n",
    "df_firm_daily = df_firm_daily.assign(date=pd.to_datetime(df_firm_daily['date']))\n",
    "\n",
    "# Member identifier in the Biographical Directory of Congress.\n",
    "# The left join keeps every tweet row, also when an account has no entry in the link table.\n",
    "link_twitter_CongressMemberID = pd.read_parquet(cd_data + 'link_twitter_CongressMemberID.parquet')\n",
    "df_firm_daily = df_firm_daily.merge(link_twitter_CongressMemberID, on='usernameTweet', how='left')\n",
    "\n",
    "# Stock returns, with 'date' parsed to datetime\n",
    "df_returns = pd.read_parquet(cd_data + 'Returns_daily.parquet')\n",
    "df_returns = df_returns.assign(date=pd.to_datetime(df_returns['date']))\n",
    "\n",
    "# Tweets with returns matched on permno and date (tweet rows without a return are kept)\n",
    "df = df_firm_daily.merge(df_returns, on=['permno', 'date'], how='left')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "39cb2db3-fc41-49fc-8193-4428c0dd04af",
   "metadata": {},
   "outputs": [],
   "source": [
    "#----------------------------------------------------------------------------------------------------------------\n",
    "#  Firm-level controls\n",
    "#----------------------------------------------------------------------------------------------------------------\n",
    "def _load_daily(filename):\n",
    "    \"\"\"Read a parquet file from cd_data and parse its 'date' column to datetime.\"\"\"\n",
    "    frame = pd.read_parquet(cd_data + filename)\n",
    "    frame['date'] = pd.to_datetime(frame['date'])\n",
    "    return frame\n",
    "\n",
    "\n",
    "def _merge_controls(base, controls, fill_columns, on=('permno', 'date')):\n",
    "    \"\"\"Left-join controls onto base and replace missing values in fill_columns by 0.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    base : pandas.DataFrame\n",
    "        Frame to extend; it is not modified.\n",
    "    controls : pandas.DataFrame\n",
    "        Frame holding the join keys and the control variables.\n",
    "    fill_columns : list of str\n",
    "        Columns of the merged frame whose missing values are set to 0.\n",
    "    on : sequence of str\n",
    "        Join keys.\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    pandas.DataFrame\n",
    "        The merged frame, with the same rows as a left join of base and controls.\n",
    "    \"\"\"\n",
    "    merged = pd.merge(base, controls, how='left', on=list(on))\n",
    "    merged[fill_columns] = merged[fill_columns].fillna(0)\n",
    "    return merged\n",
    "\n",
    "\n",
    "# News articles concerning the target companies\n",
    "df_news_daily = _load_daily('News_headlines_daily.parquet')\n",
    "df = _merge_controls(df, df_news_daily, ['toneNews'])\n",
    "\n",
    "# Tweets from major news media outlets\n",
    "df_NewsTweets_daily = _load_daily('News_tweets_daily.parquet')\n",
    "df = _merge_controls(df, df_NewsTweets_daily, ['toneNewsTweets'])\n",
    "\n",
    "# Bloomberg sentiment measure\n",
    "df_Bloomberg_daily = _load_daily('BloombergSentiment_daily.parquet')\n",
    "df = _merge_controls(df, df_Bloomberg_daily, ['BloombergSentiment'])\n",
    "\n",
    "# Forecast errors firm and analysts: Revenue and EPS\n",
    "df_FE_daily = _load_daily('ForecastErrors_daily.parquet')\n",
    "df = _merge_controls(df, df_FE_daily, ['FE_SAL', 'FE_EPS', 'FE_FIRM_SAL', 'FE_FIRM_EPS'])\n",
    "\n",
    "# Forecast revisions firm and analysts: Revenue and EPS\n",
    "# NOTE(review): FR_SAL / FR_EPS are merged here but do not appear in the `controls`\n",
    "# list of the regression cell - confirm whether that omission is intentional.\n",
    "df_FR_daily = _load_daily('ForecastRevision_daily.parquet')\n",
    "df = _merge_controls(df, df_FR_daily, ['FR_SAL', 'FR_EPS'])\n",
    "\n",
    "#----------------------------------------------------------------------------------------------------------------\n",
    "#  Macro-level controls\n",
    "#----------------------------------------------------------------------------------------------------------------\n",
    "# Macroeconomic announcements: matched on date only, so every row of a given day gets the same value\n",
    "df_macro = _load_daily('macro_controls_daily.parquet')\n",
    "df = _merge_controls(df, df_macro[['date', 'surprise_S']], ['surprise_S'], on=['date'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "9e030923-cf7c-439b-8d76-7d373142a8c2",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Add information on committee membership\n",
    "\n",
    "# From Charles Stewart's Congressional Data Page\n",
    "df_committee_mem = pd.read_parquet(cd_data + 'SenateHouseCommitteeInfo.parquet')\n",
    "\n",
    "\n",
    "def _first_active_position(date, committee):\n",
    "    \"\"\"Return the position of the first row of committee whose appointment window contains date, or None.\"\"\"\n",
    "    in_window = (date >= committee['Date of Appointment']) & (date <= committee['Date of Termination'])\n",
    "    hits = np.flatnonzero(in_window.to_numpy())\n",
    "    return int(hits[0]) if hits.size else None\n",
    "\n",
    "\n",
    "# Assign a member to a committee at the time of the tweet\n",
    "def get_extra_value(date, column_select, committee=None):\n",
    "    \"\"\"Look up column_select in the first committee row that is active on date.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    date : datetime-like\n",
    "        Day of the tweet.\n",
    "    column_select : str\n",
    "        Column of the committee frame to read.\n",
    "    committee : pandas.DataFrame, optional\n",
    "        Committee assignments to search. When omitted, the module-level\n",
    "        df_committee set by the loop below is used (the original behaviour).\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    The value of column_select in the first row with\n",
    "    'Date of Appointment' <= date <= 'Date of Termination', or None if no row matches.\n",
    "    \"\"\"\n",
    "    if committee is None:\n",
    "        committee = df_committee\n",
    "    position = _first_active_position(date, committee)\n",
    "    if position is None:\n",
    "        return None\n",
    "    return committee[column_select].iloc[position]\n",
    "\n",
    "\n",
    "select_variables = ['congress',\n",
    "                    'icpsr',\n",
    "                    'chamber',\n",
    "                    'Maj/Min',\n",
    "                    'Rank Within Party',\n",
    "                    'Senior Party Member',\n",
    "                    'cmte_senior',\n",
    "                    'Committee Period of Service',\n",
    "                    'Committee status at end of this Congress',\n",
    "                    'Committee continuity of assignment in next Congress',\n",
    "                    'cmte_name',\n",
    "                    'State Name',\n",
    "                    'member',\n",
    "                    'cmte_senior_max']\n",
    "\n",
    "# Rows with a missing icpsr never satisfy `df['icpsr'] == icpsr_i`, so they are left out of\n",
    "# df_icpsr_merged; dropna() only makes that explicit.\n",
    "# NOTE(review): confirm that dropping tweets without a member identifier is intended.\n",
    "icpsr_unique = list(df['icpsr'].dropna().unique())\n",
    "\n",
    "frames_by_member = []\n",
    "for icpsr_i in icpsr_unique:\n",
    "\n",
    "    df_select = df[df['icpsr'] == icpsr_i].copy()\n",
    "    df_committee = df_committee_mem[df_committee_mem['icpsr'] == icpsr_i].copy()\n",
    "\n",
    "    # The matching committee row depends only on the date, so resolve it once per distinct\n",
    "    # date instead of once per row and per variable.\n",
    "    # NOTE(review): only the first matching row is used, so a member holding several\n",
    "    # assignments on the same day is linked to a single committee.\n",
    "    tweet_dates = df_select['date']\n",
    "    position_by_date = {date_i: _first_active_position(date_i, df_committee)\n",
    "                        for date_i in tweet_dates.drop_duplicates()}\n",
    "\n",
    "    # Existing columns with the same name (e.g. 'icpsr') are overwritten by the committee values.\n",
    "    for index_var in select_variables:\n",
    "        committee_values = df_committee[index_var]\n",
    "        df_select[index_var] = tweet_dates.apply(\n",
    "            lambda x: None if position_by_date[x] is None else committee_values.iloc[position_by_date[x]])\n",
    "\n",
    "    frames_by_member.append(df_select)\n",
    "\n",
    "# Concatenate once at the end instead of growing the frame inside the loop\n",
    "df_icpsr_merged = pd.concat(frames_by_member) if frames_by_member else pd.DataFrame()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "9689d2de-dfde-41c8-978b-6cf467074645",
   "metadata": {},
   "outputs": [],
   "source": [
    "# similarity between committee meeting and tweet\n",
    "def calculate_similarities(df, title_column, text_column):\n",
    "    \"\"\"Flag rows whose tweet text is more similar to the meeting title than the median row.\n",
    "\n",
    "    Three scores are computed between df[text_column] and df[title_column]:\n",
    "    Jaccard similarity (textdistance, on the original casing), TF-IDF cosine\n",
    "    similarity (vocabulary fitted on the titles, lower-cased text) and a\n",
    "    length-normalised Levenshtein similarity (lower-cased text). Rows without a\n",
    "    title are excluded from the median and always receive 0.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    df : pandas.DataFrame\n",
    "        Frame holding both columns. The three similarity_* columns are added to\n",
    "        (or overwritten on) this frame; the title and text columns themselves\n",
    "        are left unchanged.\n",
    "    title_column, text_column : str\n",
    "        Names of the columns to compare.\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    pandas.DataFrame\n",
    "        Columns similarity_Jaccard, similarity_TFIDF and similarity_Levenshtein,\n",
    "        each 1 where the score is strictly above the column median and 0 otherwise.\n",
    "    \"\"\"\n",
    "    score_columns = ['similarity_Jaccard', 'similarity_TFIDF', 'similarity_Levenshtein']\n",
    "\n",
    "    if len(df) == 0:\n",
    "        # Nothing to compare, and TfidfVectorizer cannot be fitted on an empty corpus.\n",
    "        for column in score_columns:\n",
    "            df[column] = np.zeros(0, dtype=int)\n",
    "        return df[score_columns]\n",
    "\n",
    "    # Work on string copies so the caller's title/text columns are not overwritten.\n",
    "    titles = df[title_column].astype(str)\n",
    "    texts = df[text_column].astype(str)\n",
    "    titles_lower = titles.str.lower()\n",
    "    texts_lower = texts.str.lower()\n",
    "\n",
    "    # astype(str) turns a missing title into the literal string 'nan'.\n",
    "    no_title = (df[title_column].isna() | (titles_lower == 'nan')).to_numpy()\n",
    "\n",
    "    def _above_median(scores):\n",
    "        \"\"\"Return 1/0 flags for scores strictly above the median of the rows that have a title.\"\"\"\n",
    "        masked = pd.Series(np.where(no_title, np.nan, scores), index=df.index)\n",
    "        # NaN never compares greater than the median, so rows without a title end up as 0.\n",
    "        return np.where(masked > masked.median(), 1, 0)\n",
    "\n",
    "    # Jaccard similarity (computed on the original casing)\n",
    "    jaccard_scores = [textdistance.jaccard.normalized_similarity(text, title)\n",
    "                      for text, title in zip(texts, titles)]\n",
    "\n",
    "    # TF-IDF cosine similarity. TfidfVectorizer L2-normalises each row by default, so the\n",
    "    # row-wise dot product equals the cosine similarity of each (title, text) pair and the\n",
    "    # full n x n similarity matrix is not needed.\n",
    "    vectorizer = TfidfVectorizer()\n",
    "    title_vectors = vectorizer.fit_transform(titles_lower)\n",
    "    text_vectors = vectorizer.transform(texts_lower)\n",
    "    tfidf_scores = np.asarray(title_vectors.multiply(text_vectors).sum(axis=1)).ravel()\n",
    "\n",
    "    # Levenshtein.distance is an edit distance (larger = less similar). Convert it to a\n",
    "    # similarity in [0, 1] so that 'above the median' means 'more similar', as for the\n",
    "    # other two measures.\n",
    "    levenshtein_scores = [1 - Levenshtein.distance(title, text) / max(len(title), len(text), 1)\n",
    "                          for title, text in zip(titles_lower, texts_lower)]\n",
    "\n",
    "    df['similarity_Jaccard'] = _above_median(jaccard_scores)\n",
    "    df['similarity_TFIDF'] = _above_median(tfidf_scores)\n",
    "    df['similarity_Levenshtein'] = _above_median(levenshtein_scores)\n",
    "\n",
    "    return df[score_columns]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "93541b84-8530-4999-9d3d-19356af506a0",
   "metadata": {},
   "outputs": [],
   "source": [
    "# information on committee meetings\n",
    "df_meetings = pd.read_parquet(cd_data + 'Committee_meetings.parquet')\n",
    "df_meetings['date'] = pd.to_datetime(df_meetings['date'])\n",
    "df_meetings['date_bd'] = pd.to_datetime(df_meetings['date_bd'])\n",
    "\n",
    "# Split the tweet days into days with and without at least one committee meeting\n",
    "tweet_days = list(df_icpsr_merged['date'].unique())\n",
    "committee_days = list(df_meetings['date'].unique())\n",
    "\n",
    "days_tweets_committee = list(set(tweet_days).intersection(set(committee_days)))\n",
    "days_tweets_no_committee = list(set(tweet_days).difference(set(committee_days)))\n",
    "\n",
    "# tweets but no committee meeting on the day: all indicators are 0\n",
    "df_tweets_not_committee = df_icpsr_merged[df_icpsr_merged['date'].isin(days_tweets_no_committee)].copy()\n",
    "df_tweets_not_committee['index_committee_meeting'] = 0\n",
    "df_tweets_not_committee['index_belongs_committee'] = 0\n",
    "df_tweets_not_committee['similarity_Jaccard'] = 0\n",
    "df_tweets_not_committee['similarity_TFIDF'] = 0\n",
    "df_tweets_not_committee['similarity_Levenshtein'] = 0\n",
    "\n",
    "# tweets on days with a committee meeting\n",
    "df_tweets_committee_temp = df_icpsr_merged[df_icpsr_merged['date'].isin(days_tweets_committee)].copy()\n",
    "df_tweets_committee_temp['index_committee_meeting'] = 1\n",
    "\n",
    "frames_by_day = []\n",
    "for index_d in days_tweets_committee:\n",
    "\n",
    "    df_day_i = df_tweets_committee_temp[df_tweets_committee_temp['date'] == index_d].copy()\n",
    "\n",
    "    # committees having a meeting on this day\n",
    "    df_committee_i = df_meetings[df_meetings['date'] == index_d]\n",
    "    meetings_i = list(df_committee_i['cmte_name_meeting'].unique())\n",
    "\n",
    "    # 1 if the committee assigned to the tweeting member meets on this day\n",
    "    df_day_i['index_belongs_committee'] = np.where(df_day_i['cmte_name'].isin(meetings_i), 1, 0)\n",
    "\n",
    "    # For those that belong to the committee meeting get text similarity measures.\n",
    "    # Rows without a matching meeting get a missing title after the left join.\n",
    "    # NOTE(review): if a committee has several meeting rows on the same day, the left join\n",
    "    # repeats the tweet once per meeting - confirm that this duplication is intended.\n",
    "    df_day_i = pd.merge(df_day_i, df_committee_i[['cmte_name_meeting', 'title', 'bill_number']], how='left', left_on='cmte_name', right_on='cmte_name_meeting')\n",
    "    # The median thresholds inside calculate_similarities are computed within this single day.\n",
    "    df_day_i[['similarity_Jaccard', 'similarity_TFIDF', 'similarity_Levenshtein']] = calculate_similarities(df_day_i, 'title', 'text')\n",
    "\n",
    "    frames_by_day.append(df_day_i)\n",
    "\n",
    "# Concatenate once at the end instead of growing the frame inside the loop\n",
    "df_tweets_committee = pd.concat(frames_by_day) if frames_by_day else pd.DataFrame()\n",
    "\n",
    "df_tweets = pd.concat([df_tweets_committee, df_tweets_not_committee])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "15a27f6b-5c3c-446b-91b8-91d1a46e3a4a",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Regression sample: start from a copy of the tweet-level frame\n",
    "df_reg = df_tweets.copy()\n",
    "\n",
    "# Indicator variables: 1 where the flag equals 1, 0 for anything else (including missing)\n",
    "for flag_col in ['index_committee_meeting', 'index_belongs_committee']:\n",
    "    df_reg[flag_col] = df_reg[flag_col].eq(1).astype(int)\n",
    "is_high_similarity = df_reg['similarity_TFIDF'].eq(1) & df_reg['index_belongs_committee'].eq(1)\n",
    "df_reg['index_high_similarity'] = is_high_similarity.astype(int)\n",
    "\n",
    "# Tone interacted with each indicator: the tweet tone where the indicator is 1, otherwise 0\n",
    "df_reg['Tone_Meeting'] = df_reg['tone'].where(df_reg['index_committee_meeting'].eq(1), 0)\n",
    "df_reg['Tone_Meeting_committee'] = df_reg['tone'].where(df_reg['index_belongs_committee'].eq(1), 0)\n",
    "df_reg['Tone_Meeting_committee_high'] = df_reg['tone'].where(df_reg['index_high_similarity'].eq(1), 0)\n",
    "\n",
    "# Model specification\n",
    "return_select = ['exret_ff4']\n",
    "exog_vars = ['index_committee_meeting', 'index_belongs_committee', 'index_high_similarity',\n",
    "             'tone', 'Tone_Meeting', 'Tone_Meeting_committee', 'Tone_Meeting_committee_high']\n",
    "controls = ['toneNews', 'toneNewsTweets', 'BloombergSentiment',\n",
    "            'FE_SAL', 'FE_EPS', 'FE_FIRM_SAL', 'FE_FIRM_EPS',\n",
    "            'ILLIQ', 'ret_lag', 'surprise_S']\n",
    "index = ['permno', 'date']\n",
    "\n",
    "# Keep only the model columns, drop rows with any missing value, index by (permno, date)\n",
    "model_columns = return_select + exog_vars + index + controls\n",
    "df_reg = df_reg[model_columns].dropna().set_index(index)\n",
    "\n",
    "# Dependent variable scaled by 100*100\n",
    "# (presumably basis points if exret_ff4 is a decimal return - TODO confirm)\n",
    "y = df_reg[return_select] * 100 * 100\n",
    "x = sm.add_constant(df_reg[exog_vars + controls])\n",
    "\n",
    "# Panel OLS with entity effects; standard errors clustered by entity and by time\n",
    "mod = PanelOLS(y, x, entity_effects=True)\n",
    "results = mod.fit(cov_type='clustered', cluster_entity=True, cluster_time=True)\n",
    "\n",
    "params, tvalues = results.params, results.tstats\n",
    "r2 = results.rsquared * 100\n",
    "nobs = results.nobs"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e8050c34-9060-4eeb-8742-d63a6a52c6cb",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.13"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
